from clustergrammer2 import net
import clustergrammer_groupby as cby
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# Registry of every intermediate DataFrame produced in this script, keyed by a short label.
df = {}
# Raw CCLE table; the first CSV column becomes the row index (its labels are used
# further down as the background gene list).
df['ccle'] = pd.read_csv('../data/CCLE/CCLE.txt.gz', compression='gzip', index_col=0)
net.load_df(df['ccle'])
# Keep N_top=1000 rows ranked by 'var', then apply the 'zscore' normalization per row.
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['var'] = net.export_df().round(2)
genes_top_var = df['var'].index.tolist()
# Reload the filtered table so that the enrichrgram call below operates on it.
# NOTE(review): df['var'] was already row-normalized above, and the df_tmp exported
# here is overwritten later without being read -- confirm the second normalize/export
# is still wanted.
net.load_df(df['var'])
net.normalize(axis='row', norm_type='zscore')
df_tmp = net.export_df().round(2)
# presumably annotates the rows with enrichment results for this Enrichr library --
# TODO confirm against the clustergrammer2 documentation
net.enrichrgram('GO_Biological_Process_2018')
df['enrichrgram'] = net.export_df()
net.widget()
# Bare expressions below are notebook-style inspection; they have no effect in a script.
df['enrichrgram'].index.tolist()[0]
df['var'].head()
# Disabled code that would define enrichrgram_row from df_enr.
# NOTE(review): enrichrgram_row is still referenced further down in this file; with
# these lines commented out that name is never assigned in the visible code.
# enrichrgram_row = df_enr.index.tolist()[0][1:]
# enrichrgram_row = [x.split(': ')[0] + ' '+ x.split('Pval')[1] for x in enrichrgram_row]
# enrichrgram_row
from copy import deepcopy
from ast import literal_eval as make_tuple
# Column labels of df['var'] are strings; literal_eval (imported as make_tuple) parses
# each one back into a Python literal -- presumably a tuple of (name, category, ...),
# TODO confirm the label format in the CCLE file.
cols = df['var'].columns.tolist()
new_cols = [make_tuple(x) for x in cols]
df['var-cat'] = deepcopy(df['var'])
df['var-cat'].columns = new_cols
# Background gene list: every row label of the unfiltered CCLE table.
all_genes = df['ccle'].index.tolist()
len(all_genes)
# Wrap each row label in a 1-tuple so that rows, like columns, carry tuple labels.
new_rows = [(x,) for x in df['var-cat'].index.tolist()]
df['var-tuple'] = deepcopy(df['var-cat'])
df['var-tuple'].index = new_rows
net.load_df(df['var-tuple'])
net.normalize(axis='row', norm_type='zscore')
df_tmp = net.export_df().round(2)
# Cluster the normalized table, request row dendrogram categories at dendro_level=5,
# and export the result; the code below reads the category from element 1 of each row label.
net.load_df(df_tmp)
net.cluster()
net.dendro_cats(axis='row', dendro_level=5)
df['dendro'] = net.export_df()
net.widget()
# Gene names (element 0 of each row label) whose category label is 'Group 5: cat-5'.
# NOTE(review): the group label is hard-coded and the "immune" reading of this group is
# the author's interpretation -- confirm it still matches the clustering output.
rows = df['dendro'].index.tolist()
immune_genes = sorted([x[0] for x in rows if x[1] == 'Group 5: cat-5'])
len(immune_genes)
# Same selection for 'Group 5: cat-9' (named "cns" by the author; same caveat as above).
rows = df['dendro'].index.tolist()
cns_genes = sorted([x[0] for x in rows if x[1] == 'Group 5: cat-9'])
len(cns_genes)
from glob import glob
from pathlib import Path

# One .txt file per gene-set library; each file is read with net.load_gmt.
all_files = glob('../data/Enrichr_Libraries_of_Interest/*.txt')
all_files

# gmts maps library name (file name without its extension) -> loaded library.
gmts = {}
for inst_file in all_files:
    # Path.stem drops the directory part and only the trailing '.txt', and it works
    # with either path separator; splitting on '/' and replacing '.txt' anywhere in
    # the name did neither.
    inst_lib = Path(inst_file).stem
    gmts[inst_lib] = net.load_gmt(inst_file)
    print(inst_lib)
from scipy.stats import binom_test
def _binomial_pvalue(k, n, p):
    """Return the two-sided exact binomial p-value for k successes in n trials."""
    # scipy.stats.binom_test is gone from recent SciPy releases; binomtest is its
    # replacement. Fall back to the old function only when binomtest is unavailable.
    try:
        from scipy.stats import binomtest
    except ImportError:
        from scipy.stats import binom_test
        return float(binom_test(k, n, p))
    return float(binomtest(k, n, p).pvalue)


def enrich_gene_list_using_lib(libs, lib_name, gene_list, background_list, pval_cutoff=0.05):
    """Find terms of one gene-set library that are over-represented in ``gene_list``.

    For each term of ``libs[lib_name]`` the expected hit rate is the share of
    ``background_list`` that belongs to the term. When the observed hit rate in
    ``gene_list`` is higher than expected, a two-sided exact binomial p-value is
    computed; otherwise the term is given the placeholder p-value 0.5. Terms
    with a p-value below ``pval_cutoff`` are reported.

    Parameters
    ----------
    libs : dict
        Library name -> {term name -> iterable of genes}.
    lib_name : str
        Key of the library in ``libs`` to test.
    gene_list : list
        Query genes; also the row index of the returned DataFrame.
    background_list : list
        Gene universe used to compute each term's expected hit rate.
    pval_cutoff : float, optional
        Only terms with a p-value strictly below this value are kept.

    Returns
    -------
    ser_pval : pandas.Series
        P-values sorted ascending, labelled by
        ``(term, 'Library: <lib_name>', 'Pval: <p>')`` tuples.
    df_enr : pandas.DataFrame
        One row per gene in ``gene_list`` and one column per reported term
        (same order as ``ser_pval``); 1.0 where the gene belongs to the term,
        0.0 otherwise. Has no columns when nothing is reported.

    Raises
    ------
    KeyError
        If ``lib_name`` is not a key of ``libs``.
    ZeroDivisionError
        If ``background_list`` is empty while the library has terms.
    """
    lib_json = libs[lib_name]
    len_gene_list = len(gene_list)

    # Nothing can be enriched in an empty query; avoid dividing by zero below.
    if len_gene_list == 0:
        return pd.Series([], index=[], dtype=float), pd.DataFrame(index=gene_list)

    # These do not change from term to term, so build them once.
    background_set = set(background_list)
    len_background = len(background_list)
    query_set = set(gene_list)

    list_terms = []
    list_pval = []
    # list of series that will be used to make dataframe
    list_term_ser = []

    for inst_term, term_genes in lib_json.items():
        # Expected rate comes from the caller-supplied background, not a global.
        p_expect = len(background_set.intersection(term_genes)) / len_background
        found_genes = query_set.intersection(term_genes)
        actual_k = len(found_genes)

        # Only over-represented terms are tested; the rest get a fixed placeholder.
        if actual_k / len_gene_list > p_expect:
            inst_pval = _binomial_pvalue(actual_k, len_gene_list, p_expect)
        else:
            inst_pval = 0.5

        if inst_pval < pval_cutoff:
            term_name = (inst_term, 'Library: ' + lib_name, 'Pval: ' + str(inst_pval))
            list_terms.append(term_name)
            list_pval.append(inst_pval)
            # Membership vector is only built for terms that are actually reported.
            membership = [1.0 if gene in found_genes else 0.0 for gene in gene_list]
            list_term_ser.append(pd.Series(data=membership, index=gene_list, name=term_name))

    ser_pval = pd.Series(data=list_pval, index=list_terms, dtype=float).sort_values()

    # pd.concat raises on an empty list, so return an empty (genes x 0) frame instead.
    if not list_term_ser:
        return ser_pval, pd.DataFrame(index=gene_list)

    # rank df_enr by pval
    df_enr = pd.concat(list_term_ser, axis=1)[ser_pval.index.tolist()]
    return ser_pval, df_enr
# Libraries scanned for each gene group; every name is used as a key into gmts.
libraries_of_intersest = [
    'GO_Biological_Process_2018',
    'MGI_Mammalian_Phenotype_2017',
    'ChEA_2016',
    'Disease_Perturbations_from_GEO_up',
    'ARCHS4_TFs_Coexp',
    'ARCHS4_Tissues',
]
# Number of leading columns kept per library (columns come back ordered by p-value).
keep_num_terms = 5

# Same pipeline for both gene groups: enrich against every library, keep the leading
# terms of each, concatenate them side by side, then filter, cluster and display.
for enr_key, inst_genes in (('enr-immune', immune_genes), ('enr-cns', cns_genes)):
    df_list = []
    for inst_lib in libraries_of_intersest:
        ser_pval, df_enr = enrich_gene_list_using_lib(gmts, inst_lib, inst_genes, all_genes)
        df_list.append(df_enr.iloc[:, :keep_num_terms])

    df[enr_key] = pd.concat(df_list, axis=1)
    df[enr_key].shape

    net.load_df(df[enr_key])
    net.filter_N_top(inst_rc='row', N_top=500, rank_type='sum')
    net.cluster(dist_type='jaccard')
    net.widget()
# NOTE(review): neither df_term nor keep_rows is assigned at module level before this
# point in the visible code (df_term is first assigned further down, keep_rows never),
# so this statement raises NameError when the file runs top to bottom. Presumably a
# leftover notebook cell that was executed out of order -- confirm, then remove or move it.
net.load_df(df_term.loc[keep_rows])
net.swap_nan_for_zero()
net.widget()
from copy import deepcopy
def enrich_dataframe_using_lib(lib_json, df_ini, background_list, num_top_terms, pval_cutoff=0.05,
                               lib_name='library'):
    """Enrich the rows of ``df_ini`` against one library and label rows with the top terms.

    The row labels of ``df_ini`` are used as the query gene list for
    ``enrich_gene_list_using_lib``. The ``num_top_terms`` most significant
    terms are kept, and every row label is extended with one category string
    per kept term saying whether the gene belongs to that term.

    Parameters
    ----------
    lib_json : dict
        A single library: term name -> iterable of genes.
    df_ini : pandas.DataFrame
        Table whose row labels are gene names. It is not modified.
    background_list : list
        Gene universe passed through to ``enrich_gene_list_using_lib``.
    num_top_terms : int
        Maximum number of terms (lowest p-values first) to keep.
    pval_cutoff : float, optional
        Significance threshold passed through to ``enrich_gene_list_using_lib``.
    lib_name : str, optional
        Name under which ``lib_json`` is handed to ``enrich_gene_list_using_lib``;
        it shows up in the 'Library: ...' part of the term labels.

    Returns
    -------
    df_cat : pandas.DataFrame
        Copy of ``df_ini`` whose row labels are tuples
        ``(gene, '<term>: True|False<p> <pval label></p>', ...)``.
    ser_pval : pandas.Series
        All significant terms with their p-values, as returned by
        ``enrich_gene_list_using_lib``.
    df_term : pandas.DataFrame
        Gene-by-term membership table restricted to the kept terms.
    """
    gene_list = df_ini.index.tolist()

    # enrich_gene_list_using_lib expects a dict of libraries plus the name of the one
    # to test, so wrap the single library that was passed in.
    ser_pval, df_term = enrich_gene_list_using_lib({lib_name: lib_json}, lib_name, gene_list,
                                                   background_list, pval_cutoff=pval_cutoff)

    keep_terms = ser_pval.index.tolist()[:num_top_terms]
    df_term = df_term[keep_terms]

    # Per kept term: its name, its genes as a set (constant-time membership checks), and
    # its p-value label. The label is the last element of the term tuple.
    term_info = [(term[0], set(lib_json[term[0]]), term[-1]) for term in keep_terms]

    new_rows = []
    for inst_gene in gene_list:
        new_row = (inst_gene,)
        for term_name, term_members, term_pval in term_info:
            inst_found = 'True' if inst_gene in term_members else 'False'
            new_row += (term_name + ': ' + inst_found + '<p> ' + term_pval + '</p>',)
        new_rows.append(new_row)

    # Work on a copy so the caller's DataFrame keeps its original row labels.
    df_cat = df_ini.copy()
    df_cat.index = new_rows

    return df_cat, ser_pval, df_term
# Exploratory notebook cells. Bare expressions only display a value in a notebook and
# have no effect when this file runs as a script.
# NOTE(review): several statements below do not match the rest of the visible file and
# look like leftovers from an earlier notebook state -- confirm and clean up:
#   * df_var, enrichrgram_row and (at module level) gene_list are never assigned;
#     the top-variance table is stored as df['var'].
#   * df is the dict created at the top of the file, so df.index does not exist.
#   * enrich_gene_list_using_lib is defined in this file as
#     (libs, lib_name, gene_list, background_list, pval_cutoff=0.05); the calls below
#     pass a single library, a gene list and a background (old three-argument form).
#   * gmts keys come from the library file names; whether 'go-process' and 'kea'
#     exist cannot be told from this file.
df_tmp.shape
len(all_genes)
df_var.index.tolist()[0]
df_cat, ser_pval, df_term = enrich_dataframe_using_lib(gmts['go-process'], df_var, all_genes, 10)
df_cat.shape
df_cat.index.tolist()[0]
df_enr.index.tolist()[0]
net.load_df(df_cat)
net.widget()
# First element of every row label (see the note above about df being a dict).
genes_1k = [x[0] for x in df.index.tolist()]
genes_1k[:10]
df_var.index.tolist()[:10]
ser_pval, df_term = enrich_gene_list_using_lib(gmts['go-process'], df_var.index.tolist(), all_genes)
df_term.shape
ser_pval, df_term = enrich_gene_list_using_lib(gmts['kea'], df_var.index.tolist(), all_genes)
df_term.shape
df_term.head()
df_term.shape
# Gene names taken from element 0 of the row labels held in rows.
genes_1k = [x[0] for x in rows]
ser_pval, df_term = enrich_gene_list_using_lib(gmts['kea'], genes_1k, all_genes)
# Labels of the ten lowest p-values (ser_pval is sorted ascending by the function).
keep_top_enr = ser_pval.index.tolist()[:10]
net.load_df(df_term[keep_top_enr])
net.widget()
rows[:10]
len(all_genes)
len(genes_top_var)
# NOTE(review): with the current signature these four positional arguments bind to
# libs, lib_name, gene_list and background_list respectively -- 0.01 was presumably
# meant as pval_cutoff.
ser_pval, df_term = enrich_gene_list_using_lib(gmts['go-process'], genes_top_var, all_genes, 0.01)
df_term.shape
enrichrgram_row
keep_top_enr = ser_pval.index.tolist()[:10]
# Joins the first two elements of each term label into one string.
# NOTE(review): df_term's columns are labelled with the full term tuples, so selecting
# by these joined strings may not match any column -- confirm.
keep_top_enr = [x[0] + ' ' + x[1] for x in keep_top_enr]
keep_top_enr
net.load_df(df_term[keep_top_enr])
net.widget()
len(gene_list)
# NOTE(review): the only visible binding of df is the dict created at the top of the
# file, yet it is passed to generate_signatures and used through .index below as if it
# were a DataFrame -- confirm which table was intended.
df_sig, keep_genes_dict, df_gene_pval, fold_info = cby.generate_signatures(
    df, 'tissue', num_top_dims=100)
net.load_df(df_sig)
net.widget()

# For every row of df_sig, the label of the column holding its largest value.
gene_sig = df_sig.idxmax(axis=1)

# gene -> element 0 of that column label
gs_dict = {gene: gene_sig[gene][0] for gene in gene_sig.index.tolist()}

# Give each row a second label element: 'Cell Type: <name>' when the gene has an
# entry in gs_dict, otherwise 'N.A.'.
rows = df.index.tolist()
new_rows = [
    (x, 'Cell Type: ' + gs_dict[x] if x in gs_dict else 'N.A.')
    for x in rows
]
df.index = new_rows

net.load_df(df)
# Colours currently assigned to the first column category; read by set_cat_colors.
ct_color = net.viz['cat_colors']['col']['cat-0']
def set_cat_colors(axis, cat_index, cat_title=False):
    """Register every colour in the module-level ``ct_color`` mapping on ``net``.

    One ``net.set_cat_color`` call is made per entry of ``ct_color``.

    Parameters
    ----------
    axis
        Passed through to ``net.set_cat_color`` as ``axis``.
    cat_index
        Passed through to ``net.set_cat_color`` as ``cat_index``.
    cat_title : optional
        When it compares unequal to ``False``, each category name is sent as
        ``cat_title + ': ' + <key>``; otherwise the bare key is used.
    """
    for ct_key in ct_color:
        # Prefix the key with the title only when a title was supplied.
        cat_name = cat_title + ': ' + ct_key if cat_title != False else ct_key
        net.set_cat_color(
            axis=axis,
            cat_index=cat_index,
            cat_name=cat_name,
            inst_color=ct_color[ct_key],
        )
# Register the colours held in ct_color for row category index 1, then reload and display.
# NOTE(review): the row labels built above start with 'Cell Type: ' while no cat_title is
# passed here, so the names registered are the bare ct_color keys -- confirm those keys
# already carry the same prefix as the row category names.
set_cat_colors('row', 1)
net.load_df(df)
net.widget()